////////////////////////////////////////////////////////////////////////////////
*Cleaning GSS data (US)
////////////////////////////////////////////////////////////////////////////////
* Work from the US data directory and load the raw GSS extract
cd "$usdata"

use "raw/gss_raw.dta", clear

*Type of worker variables
	* employed: 1 when wrkstat is 1 or 2, 0 for everything else (missing included)
	gen employed = inlist(wrkstat, 1, 2)
	* parttime: 1 for wrkstat 2, 0 for wrkstat 1, missing for any other value
	gen parttime = (wrkstat == 2) if inlist(wrkstat, 1, 2)
	* semp: 1 for wrkslf 1, 0 for wrkslf 2, missing for any other value
	gen semp = (wrkslf == 1) if inlist(wrkslf, 1, 2)
	* semp_boss: set to 1 whenever numemps is observed, missing otherwise
	gen semp_boss = 1 if !missing(numemps)
	* Fill gaps in satjob, first from satjobhv and then from satjob1
	foreach fallback of varlist satjobhv satjob1 {
		replace satjob = `fallback' if missing(satjob) & !missing(`fallback')
	}

*Single-digit industry indicator	
	* Take the industry code from indus10 and fall back to indus07 whenever
	* indus10 is missing. missing() is used rather than ==. so that extended
	* missing values (.a-.z) are back-filled as well as system missing.
	gen naics=indus10
	replace naics=indus07 if missing(naics)
	* Leading digit of the code: convert to string, keep the first character,
	* then convert back to numeric. A missing code becomes a string starting
	* with "." and destrings back to missing.
	tostring naics, gen(naics1)
	gen onedigit_naics=substr(naics1, 1, 1)
	destring onedigit_naics, replace
	drop naics1
	
*Demographic variables
	* Squared age, with age divided by 10 before squaring
	gen age_squared=(age/10)^2
	* male: 1 for sex 1, 0 for sex 2, missing for any other value
	gen male = (sex == 1) if inlist(sex, 1, 2)
	* degree is carried forward under the name education
	rename degree education
	* insecure: 1 for joblose 1 or 2, 0 for joblose 3 or 4, missing otherwise
	gen insecure = inlist(joblose, 1, 2) if inlist(joblose, 1, 2, 3, 4)
	* nonwhite: 1 for race 2 or 3, 0 for race 1, missing otherwise
	gen nonwhite = inlist(race, 2, 3) if inlist(race, 1, 2, 3)
	* immigrant: the 0 assignment runs second and therefore takes precedence.
	* NOTE(review): the second condition also matches bornsp==2, so rows set to
	* 1 through bornsp==2 are reset to 0 -- confirm the intended bornsp coding.
	gen immigrant = 1 if born == 2 | bornsp == 2
	replace immigrant = 0 if born == 1 | (bornsp > 1 & bornsp < .)

*Union variable	
	* Combine the two union items into one indicator: 0 if either item is 2 or
	* 4; otherwise 1 if either item is 1 or 3; otherwise missing. The 0
	* assignment runs last, so it wins when the two items disagree.
	gen union_t = 1 if inlist(union, 1, 3) | inlist(union1, 1, 3)
	replace union_t = 0 if inlist(union, 2, 4) | inlist(union1, 2, 4)
	* Swap the raw items for the combined indicator, which takes the name union
	drop union union1
	rename union_t union

*Education variable	
	* uni: 1 for education 3 and above, 0 for education 2 and below, missing
	* when education is missing. The upper bound education<. is required
	* because Stata treats missing values as larger than any number, so a bare
	* education>=3 would code respondents with missing education as uni=1.
	gen uni=1 if education>=3 & education<.
	replace uni=0 if education<=2
	
	label var year "Year"
	* Keep only the variables used downstream
	keep year employed uni satjob semp naics onedigit_naics union parttime age ///
		age_squared male education wtssall joblose insecure nonwhite immigrant marital

* insample flags rows with no missing covariates that are employed, observed
* from 1977 onwards excluding 1980, 1984 and 1987, and aged 65 or below
gen insample=!missing(semp, year, onedigit_naics, age, age_squared, male, ///
	union, insecure, uni, nonwhite, immigrant)&employed==1&year>=1977& ///
	!inlist(year, 1980, 1984, 1987)&age<=65

save "clean/gss_clean.dta", replace

////////////////////////////////////////////////////////////////////////////////
*Cleaning HRS data
////////////////////////////////////////////////////////////////////////////////

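// Warning: the raw HRS files are very awkward to work with and must sit in
// specific folders on your C: drive (C:/hYYsta, one per wave) for the underlying
// .dct dictionary files to work. It is best to just use the .dta file that this
// step produces (raw/insecurity.dta), so the step is commented out below.
// NOTE(review): the commented-out code reads the 2008 file but never renames its
// insecurity variable or appends it, so 2008 is absent from the output -- confirm
// that this is intended.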

/*
*2002-2018
foreach i in 02 04 06 08 10 12 14 16 18 {
	cd "C:/h`i'sta"
	qui infile using "H`i'P_R", clear
	cd "$usdata"
	save "temp/insecurity`i'.dta", replace
}

*1996-2000
foreach i in 96 98 00 {
	cd "C:/h`i'sta"
	infile using "H`i'H_R", clear
	cd "$usdata"
	save "temp/insecurity`i'.dta", replace
}

*1994
cd "C:/h94sta"
infile using "W2C", clear
cd "$usdata"
save "temp/insecurity94.dta", replace

*Variable has a different name every year

use "temp/insecurity94.dta", clear
	rename W5801 insecurity
	recode insecurity 101/999=.
	gen year=1994
save "temp/insecurity94.dta", replace

use "temp/insecurity96.dta", clear
	rename E3788 insecurity
	recode insecurity 101/999=.
	gen year=1996
save "temp/insecurity96.dta", replace

use "temp/insecurity98.dta", clear
	rename F4583 insecurity
	recode insecurity 101/999=.
	gen year=1998
save "temp/insecurity98.dta", replace

use "temp/insecurity00.dta", clear
	rename G4996 insecurity
	recode insecurity 101/999=.
	gen year=2000
save "temp/insecurity00.dta", replace

use "temp/insecurity02.dta", clear
	rename HP014 insecurity
	recode insecurity 101/999=.
	gen year=2002
save "temp/insecurity02.dta", replace

use "temp/insecurity04.dta", clear
	rename JP014 insecurity
	recode insecurity 101/999=.
	gen year=2004
save "temp/insecurity04.dta", replace

use "temp/insecurity06.dta", clear
	rename KP014 insecurity
	recode insecurity 101/999=.
	gen year=2006
save "temp/insecurity06.dta", replace

use "temp/insecurity10.dta", clear
	rename MP014 insecurity
	recode insecurity 101/999=.
	gen year=2010
save "temp/insecurity10.dta", replace

use "temp/insecurity12.dta", clear
	rename NP014 insecurity
	recode insecurity 101/999=.
	gen year=2012
save "temp/insecurity12.dta", replace

use "temp/insecurity14.dta", clear
	rename OP014 insecurity
	recode insecurity 101/999=.
	gen year=2014
save "temp/insecurity14.dta", replace

use "temp/insecurity16.dta", clear
	rename PP014 insecurity
	recode insecurity 101/999=.
	gen year=2016
save "temp/insecurity16.dta", replace

use "temp/insecurity18.dta", clear
	rename QP014 insecurity
	recode insecurity 101/999=.
	gen year=2018
save "temp/insecurity18.dta", replace

clear
foreach i in 94 96 98 00 02 04 06 10 12 14 16 18 {
	append using "temp/insecurity`i'.dta"
	cap erase "temp/insecurity`i'.dta"
}

destring HHID, replace
destring PN, replace
gen double hhidpn=HHID*1000+PN
keep hhidpn year insecurity

save "raw/insecurity.dta", replace
*/

*RAND Corp. has put together a longitudinal dataset using HRS variables

* Start from an empty session and raise the variable limit before loading
clear
clear matrix
clear mata
set maxvar 20000

cd "$usdata"

use "raw/rand.dta", clear

*Drop all variables not related to respondent
keep r*
rename rahhidpn hhidpn

*Change from wide to long format
* Move the wave number from right after the leading r to the end of the name.
* Counting down renames waves 10-14 before wave 1, so the wave-1 pattern can
* no longer pick up the two-digit waves.
forvalues wave = 14(-1)1 {
	rename r`wave'* r*`wave'
}

* Strip the leading r from every remaining r-prefixed variable
rename r* *

keep slfemp* jcten* aeduc* jcind* jhours* agey_m* lbrf* mstat* abplace* ///
	union* unionf* agender* iearn* aracem* wtresp* abplace hhidpn

drop mstath* mstatf* lbrfh* lbrfy*

reshape long slfemp jcten jcind jcindb jcindc jhours agey_m lbrf mstat ///
	union unionf wtresp iearn, i(hhidpn) j(year)

* After the reshape, year holds the wave number; map it to the survey year
recode year 1=1992 2=1994 3=1996 4=1998 5=2000 6=2002 7=2004 ///
	8=2006 9=2008 10=2010 11=2012 12=2014 13=2016 14=2018

destring hhidpn, replace

*Merge insecurity answers in
* Rows found only in the insecurity file are discarded and no _merge is kept
merge 1:1 hhidpn year using "raw/insecurity.dta", keep(master match) nogenerate

* Declare the panel: id is a consecutive identifier built from hhidpn, and
* observations are spaced two years apart
egen id=group(hhidpn)
xtset id year, delta(2)

*Generate unemployment indicator
* employed and participant are 0/1 flags on lbrf (0 when lbrf is missing);
* unemp is 1 for lbrf 3, 0 for lbrf 1, 2 or 4, and missing otherwise
gen employed=inlist(lbrf,1,2,4)
gen participant = inlist(lbrf, 1, 2, 3, 4)
gen unemp=1 if (lbrf == 3)
replace unemp=0 if inlist(lbrf, 1,2,4)
* A fall in job tenure since the previous wave also counts as unemployment,
* except for lbrf 4 and 5. The previous-wave tenure must be observed: Stata
* treats missing as larger than any number, so without the L.jcten<. guard
* every respondent with tenure now but none recorded in the previous wave
* (including everyone in their first wave) was flagged as unemployed.
replace unemp=1 if jcten<L.jcten&L.jcten<.&lbrf!=5&lbrf!=4

*Generate insecurity variable
* Rescale by 1/100 (presumably from a 0-100 scale to 0-1 -- confirm against
* raw/insecurity.dta)
replace insecurity=insecurity/100

*Part-time worker
gen parttime=(lbrf==2)

*Self-employed
gen semp=(slfemp==1)

*Age
rename agey_m age
gen age_squared = (age^2)/100

*Industry codes
* Collapse the three industry codings (jcind, jcindb, jcindc) into a single
* 11-category variable. The assignments run in order, so when a row satisfies
* more than one rule the last matching rule determines the category.
gen industry = 1 if jcind == 1 | jcindb == 1 | jcindc == 1
replace industry = 2 if jcind == 2 | inlist(jcindb, 2, 4) | inlist(jcindc, 2, 4)
replace industry = 3 if inlist(jcind, 3, 4) | jcindb == 5 | jcindc == 5
replace industry = 4 if jcind == 5 | inlist(jcindb, 3, 8) | inlist(jcindc, 3, 8)
replace industry = 5 if jcind == 6 | jcindb == 6 | jcindc == 6
replace industry = 6 if jcind == 7 | jcindb == 7 | jcindc == 7
replace industry = 7 if jcind == 8 | inlist(jcindb, 10, 11) | inlist(jcindc, 10, 11)
replace industry = 8 if inlist(jcind, 9, 10) | inlist(jcindb, 17, 18) | inlist(jcindc, 17, 18)
replace industry = 9 if jcind == 11 | jcindb == 16 | jcindc == 16
replace industry = 10 if jcind == 12 | inlist(jcindb, 9, 12, 13, 14, 15) | inlist(jcindc, 9, 12, 13, 14, 15)
replace industry = 11 if jcind == 13 | jcindb == 19 | jcindc == 19

* Attach descriptive labels to the 11 categories
label define industries ///
	1 "agriculture/forest/fish/hunting" ///
	2 "mining&construction" ///
	3 "manufacturing" ///
	4 "transportation/warehousing/utilities" ///
	5 "wholesale" ///
	6 "retail" ///
	7 "finance/insurance/real estate" ///
	8 "business/repair/personal services" ///
	9 "entertainment/recreation" ///
	10 "Professional/information services" ///
	11 "Public administration/military"

label values industry industries

* NOTE(review): the 0/1 indicators in this section are 0, not missing, when
* the source variable is missing -- confirm that this is intended.

*Gender
gen male = (agender == 1)

*Union 
* Replace the raw union item with a 0/1 indicator of union == 1
gen new = (union == 1)
drop union
rename new union

*Race
* aracem < . (not <= .) so that a system-missing race is not counted as
* nonwhite; "<= ." is true for "." itself and only excludes .a-.z
gen nonwhite = (aracem != 1)& (aracem < .)

*Immigration status
gen immigrant = (abplace == 11)

*Education
* The aeduc < . bound is required because Stata treats missing values as
* larger than any number, so a bare aeduc >= 4 would code respondents with
* missing education as uni = 1
gen uni = (aeduc >= 4) & (aeduc < .)

*Marital status
* Regroup mstat into five categories; values not listed stay missing
gen marital = 1 if inrange(mstat,1,3)
replace marital = 2 if mstat==7
replace marital = 3 if mstat==5|mstat==6
replace marital = 4 if mstat==4
replace marital = 5 if mstat==8
la define statuses 1 "married" 2 "widowed" 3 "divorced" 4 "separated" 5 "never married"
la values marital statuses

* Retain the identifiers, weight and analysis variables only
keep hhidpn id year age age_squared wtresp industry insecurity unemp ///
	parttime semp male union nonwhite immigrant uni marital participant ///
	employed iearn

*2018 weights not yet available, so give each respondent the previous wave's weight
replace wtresp = L.wtresp if year == 2018

* insample flags rows with every listed variable observed that are employed
* and aged 65 or below
gen insample = !missing(year, age, age_squared, wtresp, insecurity, parttime, ///
	semp, male, union, nonwhite, immigrant, uni, marital, industry) ///
	& employed == 1 & age <= 65

save "clean/hrs_clean.dta", replace

////////////////////////////////////////////////////////////////////////////////

*Cleaning labor market flows from CPS, Shimer, and JOLTS

////////////////////////////////////////////////////////////////////////////////

cd "$usdata"

use "raw/cps_raw.dta", clear

	* Sample restrictions: nothing after 2019 and no rows with asecflag 1
	drop if year > 2019 | asecflag == 1
	* incoming marks mish 1 and 5; these rows are left out of the collapse below
	gen incoming = inlist(mish, 1, 5)
	* Monthly date and person identifier used to declare the panel
	gen modate = ym(year, month)
	format modate %tm
	gen double id = cpsidp
	drop if id == .
	format id %25.10g
	xtset id modate

* Derivation of the separation rate S_t used further down:
// iu_t+1 = (1-F_t)*iu_t + iu^s_t+1
// iu_t+1 = iS_t*e_t + (1-F_t)*iu_t
// F_t = 1-(u_t+1-u^s_t+1)/u_t
// S_t = (u_t+1 - (1-F_t)*u_t)/e_t
// S_t = (u_t+1 - (u_t+1-u^s_t+1)/u_t*u_t)/e_t
// S_t = u^s_t+1/e_t

*Employment indicators
* All flags are 0/1. A missing lag makes the lagged condition false, so the
* flag is 0 in that case.
gen employed = inrange(empstat, 10, 12)
* sunemployed: empstat 10-12 last month and 21-22 this month
gen sunemployed = inrange(L.empstat, 10, 12) & inrange(empstat, 21, 22)
* inv_sunemployed and layoff narrow sunemployed by whyunemp
gen inv_sunemployed = sunemployed & inrange(whyunemp, 1, 2)
gen layoff = sunemployed & whyunemp == 1
* dropout: empstat 10-12 last month and 32-36 this month
gen dropout = inrange(L.empstat, 10, 12) & inrange(empstat, 32, 36)
* switcher: empsame 1 while employed in both months
gen switcher = (empsame == 1 & employed == 1 & L.employed == 1)

* Turn each flag into a weighted count contribution
foreach flag of varlist employed inv_sunemployed layoff sunemployed dropout switcher {
	replace `flag' = compwt*`flag'
}

* Monthly weighted totals, leaving out the incoming rows
collapse (sum) employed sunemployed inv_sunemployed layoff dropout switcher ///
	if !incoming, by(year month)
drop if year < 1998
tempfile cps
save `cps'

*Merge in Fujita et al. (2020)'s job-to-job separation rates
import excel using "$usdata/raw/fujita_j2j.xlsx", firstrow sheet("Data") clear
keep year1 month1 FMP
rename year1 year
rename month1 month
rename FMP j2j_fujita

* Only months present in both the Fujita series and the CPS totals survive
merge 1:1 month year using `cps', keep(match) nogenerate

* Declare a monthly time series so that F. refers to the following month
sort year month
gen modate = ym(year, month)
tsset modate

* Rates: next month's flow divided by this month's employment total
gen cps_total_new = ((F.sunemployed+F.dropout)/employed + F.j2j_fujita)
gen cps_unemp = (F.sunemployed/employed)
gen cps_inv_new = (F.inv_sunemployed/employed)
gen cps_layoff = (F.layoff/employed)
gen cps_dropout = (F.dropout/employed)
gen cps_switcher = F.j2j_fujita

* Months whose weighted sunemployed total is zero are removed
drop if sunemployed == 0

keep year month cps*

save "temp/cps_flows_new.dta", replace

*JOLTS flows data
cd "$usdata"

* The jolts_inv and jolts_total workbooks share one layout (a Year column
* followed by Jan-Dec), so both go through the same steps. Each ends up as a
* year-month series named after its workbook, divided by 100.
foreach series in inv total {
	import excel "raw/jolts_`series'", sheet("BLS Data Series") cellrange(A14:M35) firstrow clear
	* Rename the month columns to sep_rate1 ... sep_rate12 so they reshape long
	local i = 0
	foreach var of varlist Jan-Dec {
		local ++i
		rename `var' sep_rate`i'
	}
	rename Year year

	reshape long sep_rate, i(year) j(month)
	drop if missing(sep_rate)
	* The full variable name is spelled out so that this does not depend on
	* variable abbreviation being switched on
	replace sep_rate=sep_rate/100
	rename sep_rate jolts_`series'

	* The first series is parked on disk; the second stays in memory for the merge
	if "`series'" == "inv" {
		save "temp/jolts_inv.dta", replace
	}
}

merge 1:1 year month using "temp/jolts_inv.dta"
drop _merge

save "temp/jolts_flows.dta", replace

*Shimer flows
cd "$usdata"

import delimited "raw/sep-prob.txt", clear
	rename (v1 v2) (year sep_rate_i)
	* Strip the leading character from the date string and make it numeric
	replace year=substr(year, 2, .)
	destring year, replace
	* Keep the unrounded date so that observations can be numbered in date
	* order within each year. Sorting on the floored year alone leaves the
	* order of ties unspecified, so the month numbers could be shuffled.
	gen double obs_date=year
	replace year=floor(year)
	bys year (obs_date): gen month=_n
	drop obs_date
	* Observations 2, 3 and 4 within a year become months 4, 7 and 10
	recode month 2=4 3=7 4=10
	
	* Remove the closing brace from the rate string and make it numeric
	replace sep_rate_i=subinstr(sep_rate_i, "}", "",.)
	destring sep_rate_i, replace
	* Full variable name used so this does not depend on abbreviation
	rename sep_rate_i shimer_total
	replace shimer_total = 1 - exp(-shimer_total)
save "temp/shimer_flows.dta", replace

*Put CPS, JOLTS, and Shimer flows together
use "temp/cps_flows_new.dta", clear

* JOLTS months with no CPS counterpart are discarded
merge 1:1 year month using "temp/jolts_flows.dta", keep(master match) nogenerate
* Every Shimer row is retained, whether or not it matches
merge 1:1 year month using "temp/shimer_flows.dta", nogenerate
sort year

* Collapse each series to one value per year (collapse defaults to the mean)
collapse shimer_total cps_total_new cps_unemp cps_inv_new cps_layoff ///
	cps_dropout cps_switcher jolts_total jolts_inv, by(year)

save "clean/us_flows.dta", replace

* Remove every file left in the temp directory. Note that the working
* directory stays at $usdata/temp afterwards.
cd "$usdata/temp"
local files: dir "`c(pwd)'" files "*"

* The file name is quoted so that names containing spaces are erased too;
* capture still ignores any file that cannot be removed
foreach file of local files {
	cap erase "`file'"
}
